The settings list settings configure which parts, simulations, and analysis steps of this R markdown are executed. This way single components of the analysis can be disabled for the purpose of saving computation time. Setting settings$all to TRUE will set all other options to true regardless of their value.
settings <- list(
# If true sets all other settings to TRUE
all=F,
# Generate interactive plots of co-expression structures
investigate.coexpression=T,
# Perform Differential Expression Analysis
perform.DEA=T,
# Generate paper figures
generate.paper.figs=T,
# Specify if paper figures are exported
export.figures = F,
# Path for file exports
paper.fig.path = "../danawriteup/figs/",
# Clears environment
debug=TRUE
)
if(settings$all) {
settings$investigate.coexpression=TRUE
settings$perform.DEA=TRUE
settings$generate.paper.figs=TRUE
}
if(settings$debug) rm(list=ls()[ls()!="settings"])
The config list contains all parameters for the analysis. Remember to always set the working directory and the paths to the data/external files prior to running the code.
config <- list(
DANA.path = "./R/DANA.R",
## Data
case.name = "TCGA",
# Read count data file for the full data set
data.file.path = "data/TCGA_harmonized_BRCA_UCS.RData",
# RData file for coordinates data frame based on miRBase miRNA definitions
coords.file.path = "data/miRBase.coords.RData",
## Normalization Methods
# Specify which normalization methods will be applied
norm.apply.TC = TRUE,
norm.apply.UQ = TRUE,
norm.apply.med = TRUE,
norm.apply.TMM = TRUE,
norm.apply.DESeq = TRUE,
norm.apply.PoissonSeq = TRUE,
norm.apply.QN = TRUE,
norm.apply.RUV = TRUE,
# thresholds for zero-expressed, poorly-expressed and well-expressed genes
t.zero = 2, # zero-expressed in [0, t.zero)
t.poor = 5, # poorly-expressed in [t.zero, t.well]
t.well = 100, # well-expressed in [t.well, inf)
# distance threshold for clustering
cluster.threshold = 10000,
# preprocessing data transformation type: none ("n"), modified log2 ("log2"),
# or cube root ("cube") available
preprocess.transform = "log2",
## Correlation Estimation for positive and negative controls
# Available methods | Tuning parameter calibration schemes
# "mb" | "cv", "aic", "bic", "av"
# "huge.mb" | "ric", "stars"
# "glasso" | "ric", "stars", "ebic"
# "fastggm" | none
# "pearson" | none
# "spearman" | none
# Positive Controls
corr.method.pos = "mb",
tuning.criterion.pos = "bic",
# Negative Controls
corr.method.neg = "pearson",
tuning.criterion.neg = "",
# Generate plots within DANA
generate.plots = FALSE
)
Configuration for the differential expression analysis
config.DEA <- list(
## Data
case.name = "TCGA",
## Normalization Methods
# Specify which normalization methods will be applied
norm.apply.TC = TRUE,
norm.apply.UQ = TRUE,
norm.apply.med = TRUE,
norm.apply.TMM = TRUE,
norm.apply.DESeq = TRUE,
norm.apply.PoissonSeq = TRUE,
norm.apply.QN = TRUE,
norm.apply.RUV = FALSE,
# Significance level for DEA
alpha = 0.01,
# Plots
generate.volcano.plot = TRUE,
generate.meanvar.plot = TRUE,
# Use q-values (adjusted p-values) instead of p-values
q.values = FALSE,
# RUV reduces the parameter size. Reduce DEA result to RUV genes?
perform.subsetting = FALSE
)
Load all required R libraries/packages.
# DANA functions
source(config$DANA.path)
## Warning: package 'ggplot2' was built under R version 4.0.3
## Warning: package 'gridExtra' was built under R version 4.0.3
## Warning: package 'ggnewscale' was built under R version 4.0.3
## Warning: package 'corrplot' was built under R version 4.0.3
## Warning: package 'stargazer' was built under R version 4.0.3
## Warning: package 'plotly' was built under R version 4.0.3
## Warning: package 'ggrepel' was built under R version 4.0.3
## Warning: package 'glmnet' was built under R version 4.0.3
## Warning: package 'huge' was built under R version 4.0.3
# Libraries
library(openxlsx) # read excel files
library(corrplot) # For mixed correlation plots
library(cowplot) # Arrange plots
## Warning: package 'cowplot' was built under R version 4.0.3
library(RColorBrewer) # Colors
library(latex2exp) # for latex in axis labels
library(ggcorrplot) # ggplot2 correlation plots
## Warning: package 'ggcorrplot' was built under R version 4.0.3
We load the data set into memory.
# Load the p x n matrix of read counts into the workspace
load(config$data.file.path)
# Unlist the data
data.BRCA <- TCGA.BRCA.UCS$BRCA
data.UCS <- TCGA.BRCA.UCS$UCS
# Transform gene names to lower case
rownames(data.BRCA) <- tolower(rownames(data.BRCA))
rownames(data.UCS) <- tolower(rownames(data.UCS))
# Build single RC matrix
data.RC <- data.TCGA <- cbind(data.BRCA, data.UCS)
data.groups <- c(rep("BRCA", dim(data.BRCA)[2]), rep("UCS", dim(data.UCS)[2]))
# Inspect
cat("Dimensions of the data: ", dim(data.RC), "\n")
## Dimensions of the data: 1881 223
A coordinates data frame that specifies the base pair location of each miRNA of the data set on the chromosomes is loaded.
# Load miRNA chromosome location coordinates data frame
load(config$coords.file.path)
coords <- coords # change according to the name of the loaded data matrix
Some miRNAs map to multiple locations of sequence families. We named the sequence family with a parenthesis that reflects the number of members from all the different genomic locations (e.g. hsa-let-7a(3)). These miRNAs cannot be uniquely mapped to the genome, therefore we must exclude these from the location based analysis.
# only consider genes that are present in the coordinate data frame
genes.in.coords <- rownames(coords)[na.omit(match(rownames(data.RC), rownames(coords)))]
cat(dim(data.RC)[1]-length(genes.in.coords), " genes not found in coords. Reducing data set to ", length(genes.in.coords), "genes.\n")
## 33 genes not found in coords. Reducing data set to 1848 genes.
# test data set
data.RC <- data.RC[genes.in.coords, ]
# benchmark data set
genes <- rownames(data.RC)
rm(genes.in.coords)
First, we investigate the distribution of mean miRNA expression of the data.
# Histogram plot test data set
par(mfrow=c(1,1))
df <- data.frame(log.expression=log2(rowMeans(data.RC)+1))
print(ggplot(df, aes(x=log.expression)) +
geom_histogram(binwidth = .1, color="black", fill="black") +
geom_vline(xintercept = log2(config$t.zero+1), color="blue", linetype="dashed") +
geom_vline(xintercept = log2(config$t.poor+1), color="blue", linetype="dashed") +
geom_vline(xintercept = log2(config$t.well+1), color="red", linetype="dashed") +
ggtitle("Test data set"))
rm(df)
We observe that there is a large proportion of poorly-expressed genes. Some of them have extremely low or zero mean expression which corresponds to essentially zero reads.
We apply the full analysis pipeline to the data set. This includes:
The following parameters are used:
genes <- rownames(data.RC)
## Apply Normalization
data.norm <- normalize(data.RC,
groups=data.groups,
name=config$case.name,
apply.TC=config$norm.apply.TC,
apply.UQ=config$norm.apply.UQ,
apply.med=config$norm.apply.med,
apply.TMM=config$norm.apply.TMM,
apply.DESeq=config$norm.apply.DESeq,
apply.PoissonSeq=config$norm.apply.PoissonSeq,
apply.QN=config$norm.apply.QN,
apply.RUV=config$norm.apply.RUV)
## 1096 genes has been filtered because they contains too small number of reads across the experiments.
## Define Controls
pre.res <- define.controls(data.RC,
t.zero=config$t.zero,
t.poor=config$t.poor,
t.well=config$t.well,
t.cluster=config$cluster.threshold,
coords=coords,
clusters=NULL)
## Number of positive control markers with RC in [100, inf): 116
## Number of negative control markers with RC in [2, 5]: 91
pos.controls <- pre.res$genes.pos
neg.controls <- pre.res$genes.neg
clusters <- pre.res$clusters
clusters.data <- clusters
# Apply DANA pipeline
DANA.results <- DANA(data.RC=data.RC,
data.norm=data.norm,
pos.controls=pos.controls,
neg.controls=neg.controls,
clusters=clusters.data,
coords=coords,
case.name="benchmark",
generate.plots=config$generate.plots,
preprocess.transform=config$preprocess.transform,
corr.method.pos=config$corr.method.pos,
tuning.criterion.pos=config$tuning.criterion.pos,
corr.method.neg=config$corr.method.neg,
tuning.criterion.neg=config$tuning.criterion.neg)
## Estimating correlations for pos. and neg. controls for data set benchmark...done
## Estimating correlations for pos. and neg. controls for data set TCGA.TMM...done
## Estimating correlations for pos. and neg. controls for data set TCGA.DESeq...done
## Estimating correlations for pos. and neg. controls for data set TCGA.PoissonSeq...done
## Estimating correlations for pos. and neg. controls for data set TCGA.TC...done
## Estimating correlations for pos. and neg. controls for data set TCGA.Med...done
## Estimating correlations for pos. and neg. controls for data set TCGA.RUVg...done
## Estimating correlations for pos. and neg. controls for data set TCGA.RUVs...done
## Estimating correlations for pos. and neg. controls for data set TCGA.RUVr...done
## Estimating correlations for pos. and neg. controls for data set TCGA.UQ...done
## Estimating correlations for pos. and neg. controls for data set TCGA.QN...done
## Comparing models benchmark vs. TCGA.TMM....done
## Comparing models benchmark vs. TCGA.DESeq....done
## Comparing models benchmark vs. TCGA.PoissonSeq....done
## Comparing models benchmark vs. TCGA.TC....done
## Comparing models benchmark vs. TCGA.Med....done
## Comparing models benchmark vs. TCGA.RUVg....done
## Comparing models benchmark vs. TCGA.RUVs....done
## Comparing models benchmark vs. TCGA.RUVr....done
## Comparing models benchmark vs. TCGA.UQ....done
## Comparing models benchmark vs. TCGA.QN....done
if(!config$generate.plots) {
stargazer(DANA.results$metrics, type="text", summary=FALSE, digits=4,
title="Comparison statistics for the TCGA-BRCA/UCS data set", align=TRUE)
}
##
## Comparison statistics for the TCGA-BRCA/UCS data set
## =============================
## cc mcr
## -----------------------------
## TCGA.TMM 0.9904 0.7154
## TCGA.DESeq 0.9896 0.7332
## TCGA.PoissonSeq 0.9882 0.7166
## TCGA.TC 0.9927 0.5344
## TCGA.Med 0.9245 0.3628
## TCGA.RUVg 0.9854 0.7167
## TCGA.RUVs 0.8288 0.8121
## TCGA.RUVr 0.9763 0.7767
## TCGA.UQ 0.9374 0.3981
## TCGA.QN 0.9717 0.7834
## -----------------------------
iplot.data.noNorm <- iggplot.corr(DANA.results$data.model$pos$corr, clusters=clusters, title="Positive controls (un-normalized)", coords=coords)
iplot.data.TMM <- iggplot.corr(DANA.results$norm.models$TCGA.TMM$pos$corr, clusters=clusters, title="Positive controls (TMM)", coords=coords)
We compare the estimated co-expression structures for the test and benchmark dataset. Here, we take a look at the models for the un-normalized data as well as TMM-normalized data. Note that you cannot directly compare the graphs of benchmark with test data since the considered set of genes is not identical.
if(settings$investigate.coexpression) {
htmltools::tagList(list(iplot.data.noNorm, iplot.data.TMM)) # show plots in markdown
}
if(settings$perform.DEA) {
## Reset data
data.RC <- data.TCGA
## Normalize test Data
data.norm <- normalize(data.RC,
groups=data.groups,
name="TCGA",
apply.TC=config.DEA$norm.apply.TC,
apply.UQ=config.DEA$norm.apply.UQ,
apply.med=config.DEA$norm.apply.med,
apply.TMM=config.DEA$norm.apply.TMM,
apply.DESeq=config.DEA$norm.apply.DESeq,
apply.PoissonSeq=config.DEA$norm.apply.PoissonSeq,
apply.QN=config.DEA$norm.apply.QN,
apply.RUV=FALSE)
# RUV normalization
if(config.DEA$norm.apply.RUV) {
data.norm.RUV.adjust <- list(test.RUVr=norm.RUV(data.RC, data.groups, "RUVr")$adjust.factor,
test.RUVs=norm.RUV(data.RC, data.groups, "RUVs")$adjust.factor,
test.RUVg=norm.RUV(data.RC, data.groups, "RUVg")$adjust.factor)
}
}
## 1124 genes has been filtered because they contains too small number of reads across the experiments.
if(settings$perform.DEA) {
## Perform DEA on full dataset
test.DEA <- DE.voom(data.RC=data.RC, groups=data.groups, plot=config.DEA$generate.meanvar.plot, plot.title="Un-normalized")
if(config.DEA$generate.volcano.plot) plot.DE.volcano(test.DEA, alpha=config.DEA$alpha, q.values=config.DEA$q.values, title="Un-normalized")
## DEA for normalized test data
data.norm.DEA <- list()
for(i in 1:length(data.norm)) {
data.norm.DEA <-
append(data.norm.DEA, list(DE.voom(data.RC=data.norm[[i]],
groups=data.groups,
plot=config.DEA$generate.meanvar.plot,
plot.title=names(data.norm)[i])))
if(config.DEA$generate.volcano.plot) {
plot.DE.volcano(data.norm.DEA[[i]],
alpha=config.DEA$alpha,
q.values=config.DEA$q.values,
title=names(data.norm)[i])
}
}
## DEA for RUV normalization
if(config.DEA$norm.apply.RUV) {
for(i in 1:length(data.norm.RUV.adjust)) {
data.norm.DEA <-
append(data.norm.DEA, list(DE.voom(data.RC=data.RC,
groups=data.groups,
plot=config.DEA$generate.meanvar.plot,
plot.title=names(data.norm.RUV.adjust)[i],
adjust=data.norm.RUV.adjust[[i]])))
if(config.DEA$generate.volcano.plot) {
plot.DE.volcano(data.norm.DEA[[i]],
alpha=config.DEA$alpha,
q.values=config.DEA$q.values,
title=names(data.norm.RUV.adjust)[i])
}
}
names(data.norm.DEA) <- c(names(data.norm), names(data.norm.RUV.adjust))
} else {
names(data.norm.DEA) <- names(data.norm)
}
}
## number of DE genes: 1515
## number of DE genes: 1552
## number of DE genes: 1586
## number of DE genes: 1243
## number of DE genes: 597
## number of DE genes: 1198
## number of DE genes: 1134
## number of DE genes: 554
if(settings$generate.paper.figs) {
# Dimension of data after analysis
cat("Data:\n")
cat(" - Dimensions: p=", dim(data.RC)[1],", n=", dim(data.RC)[2], "\n", sep="")
cat(" - Positive controls:\n")
cat(" * Definition mean RC in interval: [", config$t.well, ", inf ]\n", sep="")
cat(" * Number of positive controls miRNAs:", length(pos.controls), "\n")
cat(" - Negative controls:\n")
cat(" * Definition mean RC in interval: [", config$t.zero, ",", config$t.poor, "]\n")
cat(" * Number of negative controls miRNAs:", length(neg.controls), "\n")
}
## Data:
## - Dimensions: p=1881, n=223
## - Positive controls:
## * Definition mean RC in interval: [100, inf ]
## * Number of positive controls miRNAs: 116
## - Negative controls:
## * Definition mean RC in interval: [ 2 , 5 ]
## * Number of negative controls miRNAs: 91
if(settings$generate.paper.figs) {
par(mfrow=c(1,1))
# Histogram plot test data set
data.RC.log2 <- log2(rowMeans(data.RC)+1)
df <- data.frame(log.expression=data.RC.log2)
p.data.RC.hist <- ggplot(df, aes(x=log.expression)) +
geom_histogram(binwidth = .1, color="black", fill="black") +
geom_vline(xintercept = log2(config$t.zero+1), color="blue", linetype="dashed") +
geom_vline(xintercept = log2(config$t.poor+1), color="blue", linetype="dashed") +
geom_vline(xintercept = log2(config$t.well+1), color="red", linetype="dashed") +
ggtitle("TCGA-BRCA/UCS data set") +
theme_classic()
print(p.data.RC.hist)
p.RC.hist <- ggplot(df) +
theme_classic() +
geom_histogram(aes(x=log.expression, y=..count..), binwidth = .1) +
geom_vline(xintercept = log2(config$t.zero+1), color="#5851b8", linetype="twodash") +
geom_vline(xintercept = log2(config$t.poor+1), color="#5851b8", linetype="twodash") +
geom_vline(xintercept = log2(config$t.well+1), color="#E7298A", linetype="longdash") +
geom_segment(aes(x = log2(config$t.zero+1), y = 600, xend = log2(config$t.poor+1), yend = 600),
arrow=arrow(length=unit(.07, "in"), ends="both"),
color="#5851b8") +
annotate(geom="label", x=(log2(config$t.zero+1)+log2(config$t.poor+1))/2.0, y=630,
label="neg. contr.", color="#5851b8", size=4, fill = alpha(c("white"),0.85), label.size = NA) +
geom_segment(aes(x = log2(config$t.well+1), y = 600, xend = log2(config$t.well+1)+4, yend = 600),
arrow=arrow(length=unit(.07, "in"), ends="last"),
color="#E7298A") +
annotate(geom="label", x=log2(config$t.well+1)+.5, y=630,
label="pos. contr.", color="#E7298A", size=4, hjust=0, fill = alpha(c("white"),0.85), label.size = NA) +
xlab("Mean log2(read count)") +
ylab("Frequency")
print(p.RC.hist)
## Export Plots
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_RC_Distribution.pdf"), p.RC.hist, width=5, height=4, device="pdf")
}
}
if(settings$generate.paper.figs) {
# Function for mean-variance plot for a given data set
mean.variance.plot <- function(data.RC, title, axis.limit=NULL) {
df <- data.frame(data.mean=log10(rowMeans(data.RC) + 1),
data.var =log10(rowVars(data.RC) + 1))
lin.fit <- lm(df$data.var ~ df$data.mean)$coefficients
p <- ggplot(df,aes(x=data.mean,y=data.var)) +
geom_point(alpha=.25) +
xlab("log10(Mean)") +
ylab("log10(Variance)") +
geom_abline(intercept = lin.fit[1], slope = lin.fit[2], color="red", linetype="longdash", size=1) +
geom_abline(intercept = 0, slope = 1, colour="blue") +
annotate("text", alpha=1, x=4, y=0.5, hjust=0, vjust=0, size=3, colour="red",
label=paste0("log10(Variance) = ", round(lin.fit[1],2), " + ", round(lin.fit[2],2), "*log10(Mean)")) +
xlim(0, ifelse(is.null(axis.limit), max(df), axis.limit)) +
ylim(0, ifelse(is.null(axis.limit), max(df), axis.limit)) +
ggtitle(title) +
theme(aspect.ratio=1, plot.title = element_text(hjust = 0.5)) +
theme_bw()
return(p)
}
# Both subtypes
p.meanvar <- mean.variance.plot(data.RC, "Full data set", axis.limit=12.5)
print(p.meanvar)
# BRCA
p.meanvar.BRCA <- mean.variance.plot(data.RC[, data.groups=="BRCA"], "BRCA", axis.limit=12.5)
print(p.meanvar.BRCA)
# UCS
p.meanvar.UCS <- mean.variance.plot(data.RC[, data.groups=="UCS"], "UCS", axis.limit=12.5)
print(p.meanvar.UCS)
}
if(settings$generate.paper.figs) {
# Both subtypes
p.meansd <- plot.mean.sd(data.RC, config$t.zero, config$t.poor, config$t.well, "Full data set")
print(p.meansd)
# BRCA
p.meansd.BRCA <- plot.mean.sd(data.RC[, data.groups=="BRCA"], config$t.zero, config$t.poor, config$t.well, "BRCA")
print(p.meansd.BRCA)
# UCS
p.meansd.UCS <- plot.mean.sd(data.RC[, data.groups=="UCS"], config$t.zero, config$t.poor, config$t.well, "UCS")
print(p.meansd.UCS)
## Export Plots
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_MeanSD.pdf"), p.meansd + labs(title = NULL), width = 5, height=4, device="pdf")
}
}
if(settings$generate.paper.figs) {
num.genes.plot.pos <- 60
num.genes.plot.neg <- 60
# Co-expression models
data.model <- DANA.results$data.model
data.norm.model <- DANA.results$norm.models$TCGA.DESeq
# Common control miRNAs
pos.controls <- rownames(data.model$pos$corr)
pos.controls.norm <- rownames(data.norm.model$pos$corr)
pos.controls.common <- intersect(pos.controls, pos.controls.norm)
neg.controls <- rownames(data.model$neg$corr)
neg.controls.norm <- rownames(data.norm.model$neg$corr)
# reduce the number of genes for the plot
num.genes.plot.pos <- min(num.genes.plot.pos, length(pos.controls.common))
pos.controls.plot <- pos.controls.common[1:num.genes.plot.pos]
num.genes.plot.neg <- min(num.genes.plot.neg,
dim(data.model$neg$corr)[1],
dim(data.norm.model$neg$corr)[1])
# Subsetted correlation matrices
corr.data.pos <- data.model$pos$corr[pos.controls.plot, pos.controls.plot]
corr.data.DESeq.pos <- data.norm.model$pos$corr[pos.controls.plot, pos.controls.plot]
corr.data.neg <- data.model$neg$corr[1:num.genes.plot.neg, 1:num.genes.plot.neg]
corr.data.DESeq.neg <- data.norm.model$neg$corr[1:num.genes.plot.neg, 1:num.genes.plot.neg]
## Generate plots
# Positive controls
p.corr.pos <- ggplot.corr(corr.data.pos,
clusters=clusters,
threshold=0,
title="Un-normalized",
coords=coords)
p.corr.pos.DESeq <- ggplot.corr(corr.data.DESeq.pos,
clusters=clusters,
threshold=0,
title="DESeq",
coords=coords)
p.corr.pos.two <- arrangeGrob(p.corr.pos.DESeq + theme(legend.position="none"),
p.corr.pos + theme(legend.position="none"),
get.legend(p.corr.pos.DESeq + theme(legend.position = "bottom")),
layout_matrix=rbind(c(1,2), c(4,4)),
heights = c(5,1))
grid.arrange(p.corr.pos.two) # plot
# Negative controls
p.corr.neg <- ggplot.corr(corr.data.neg,
clusters=clusters,
threshold=0,
title="Un-normalized")
p.corr.neg.DESeq <- ggplot.corr(corr.data.DESeq.neg,
clusters=clusters,
threshold=0,
title="DESeq")
p.corr.neg.two <- arrangeGrob(p.corr.neg.DESeq + theme(legend.position="none"),
p.corr.neg + theme(legend.position="none"),
get.legend(p.corr.neg.DESeq + theme(legend.position = "bottom")),
layout_matrix=rbind(c(1,2), c(4,4)),
heights = c(5,1))
grid.arrange(p.corr.neg.two) # plot
## Export Plots
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_CorrPos_DESeq.pdf"), p.corr.pos.two, width = 5, height=3.5, device="pdf")
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_CorrNeg_DESeq.pdf"), p.corr.neg.two, width = 5, height=3.5, device="pdf")
}
}
if(settings$generate.paper.figs) {
p.corr.noNorm <- ggplot.corr(DANA.results$data.model$pos$corr,
clusters=clusters,
title="un-normalized")
p.corr.TMM <- ggplot.corr(DANA.results$norm.models$TCGA.TMM$pos$corr,
clusters=clusters,
title="TMM")
p.corr.DESeq <- ggplot.corr(DANA.results$norm.models$TCGA.DESeq$pos$corr,
clusters=clusters,
title="DESeq")
p.corr.PoissonSeq <- ggplot.corr(DANA.results$norm.models$TCGA.PoissonSeq$pos$corr,
clusters=clusters,
title="PoissonSeq")
p.corr.TC <- ggplot.corr(DANA.results$norm.models$TCGA.TC$pos$corr,
clusters=clusters,
title="TC")
p.corr.Med <- ggplot.corr(DANA.results$norm.models$TCGA.Med$pos$corr,
clusters=clusters,
title="Med")
p.corr.RUVg <- ggplot.corr(DANA.results$norm.models$TCGA.RUVg$pos$corr,
clusters=clusters,
title="RUVg")
p.corr.RUVs <- ggplot.corr(DANA.results$norm.models$TCGA.RUVs$pos$corr,
clusters=clusters,
title="RUVs")
p.corr.RUVr <- ggplot.corr(DANA.results$norm.models$TCGA.RUVr$pos$corr,
clusters=clusters,
title="RUVr")
p.corr.UQ <- ggplot.corr(DANA.results$norm.models$TCGA.UQ$pos$corr,
clusters=clusters,
title="UQ")
p.corr.QN <- ggplot.corr(DANA.results$norm.models$TCGA.QN$pos$corr,
clusters=clusters,
title="QN")
# Arrange plots
p.corr.all <-
plot_grid(p.corr.noNorm + theme(legend.position="none"),
p.corr.TMM + theme(legend.position="none"),
p.corr.DESeq + theme(legend.position="none"),
p.corr.PoissonSeq + theme(legend.position="none"),
p.corr.TC + theme(legend.position="none"),
p.corr.Med + theme(legend.position="none"),
p.corr.RUVg + theme(legend.position="none"),
p.corr.RUVs + theme(legend.position="none"),
p.corr.RUVr + theme(legend.position="none"),
p.corr.UQ + theme(legend.position="none"),
p.corr.QN + theme(legend.position="none"),
get.legend(p.corr.noNorm + theme(legend.position = "bottom")),
ncol=3)
plot(p.corr.all)
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_Corr_All.pdf"), p.corr.all, width=9, height=12, device="pdf")
}
}
if(settings$generate.paper.figs) {
# Co-expression models
model <- DANA.results$data.model
norm.model1 <- DANA.results$norm.models$TCGA.DESeq
norm.model2 <- DANA.results$norm.models$TCGA.RUVs
# Set number of genes for negative correlation to minimum
num.genes.plot.neg <- min(dim(model$neg$corr)[1],
dim(norm.model1$neg$corr)[1],
dim(norm.model2$neg$corr)[1])
# Subsetted correlation matrices
corr.neg <- model$neg$corr[1:num.genes.plot.neg, 1:num.genes.plot.neg]
corr.norm1.neg <- norm.model1$neg$corr[1:num.genes.plot.neg, 1:num.genes.plot.neg]
corr.norm2.neg <- norm.model2$neg$corr[1:num.genes.plot.neg, 1:num.genes.plot.neg]
## extract non-zero correlations
nnz.corr.neg <- corr.neg[upper.tri(corr.neg)]
nnz.corr.neg <- nnz.corr.neg[nnz.corr.neg != 0]
nnz.corr.norm1.neg <- corr.norm1.neg[upper.tri(corr.norm1.neg)]
nnz.corr.norm1.neg <- nnz.corr.norm1.neg[nnz.corr.norm1.neg != 0]
nnz.corr.norm2.neg <- corr.norm2.neg[upper.tri(corr.norm2.neg)]
nnz.corr.norm2.neg <- nnz.corr.norm2.neg[nnz.corr.norm2.neg != 0]
# Keep positive correlations
nnz.corr.neg <- nnz.corr.neg[nnz.corr.neg > 0]
nnz.corr.norm1.neg <- nnz.corr.norm1.neg[nnz.corr.norm1.neg > 0]
nnz.corr.norm2.neg <- nnz.corr.norm2.neg[nnz.corr.norm2.neg > 0]
## direct comparison of negative controls un-normalized vs. DESeq vs. RUVs
neg.corr <- data.frame(
control=factor(c(rep("Un-normalized", length(nnz.corr.neg)),
rep("DESeq", length(nnz.corr.norm1.neg)),
rep("RUVs", length(nnz.corr.norm2.neg)))),
corr=abs(c(nnz.corr.neg, nnz.corr.norm1.neg, nnz.corr.norm2.neg))
)
# ttt <- data.frame(prop.table(table(method=neg.corr$control, corr=neg.corr$corr), 1))
p.density.neg.nnz.freq <- ggplot(neg.corr, aes(x=corr, colour=control, linetype=control)) +
geom_freqpoly(binwidth=.05, alpha=.8, size=.9) +
theme_classic() +
xlim(0, 1) +
# scale_linetype_manual(values=c("twodash", "solid", "solid", "solid"))+
scale_color_manual(values=c('#e7298a','#1B9E77','#D95F02')) + # First colors from rcolorbrewer set "Dark2"
theme(legend.position=c(0.9,0.86),
legend.title=element_blank(),
legend.background=element_rect(fill=alpha('white', 0.5))) +
ylab("Frequency") +
xlab("Marginal correlation")#+ ggtitle("Positive correlation among negative controls")
print(p.density.neg.nnz.freq)
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_CorrDensityNeg_Freq.pdf"), p.density.neg.nnz.freq, width = 5, height=4, device="pdf")
}
}
## Warning: Removed 6 row(s) containing missing values (geom_path).
if(settings$generate.paper.figs) {
# Co-expression models
model <- DANA.results$data.model
# Set number of genes for negative correlation to minimum
num.genes.plot.neg <- min(dim(model$neg$corr)[1],
sapply(DANA.results$norm.models, function(x) dim(x$neg$corr)[1]))
# Subsetted correlation matrices
corr.neg <- model$neg$corr[1:num.genes.plot.neg, 1:num.genes.plot.neg]
corr.norm.neg <- lapply(DANA.results$norm.models, function(x) x$neg$corr[1:num.genes.plot.neg, 1:num.genes.plot.neg])
## Strictly positive correlations
nnz.corr.neg <- corr.neg[upper.tri(corr.neg)]
nnz.corr.neg <- nnz.corr.neg[nnz.corr.neg > 0]
nnz.corr.norm.neg <- lapply(corr.norm.neg, function(x) x[upper.tri(x)][x[upper.tri(x)] > 0])
names(nnz.corr.norm.neg) <- substr(names(nnz.corr.norm.neg), 6, 20)
control <- c(rep("un-normalized", length(nnz.corr.neg)))
for(i in 1:length(nnz.corr.norm.neg)) {
control <- c(control, rep(names(nnz.corr.norm.neg)[i], length(nnz.corr.norm.neg[[i]])))
}
## direct comparison of negative controls
neg.corr <- data.frame(
control=factor(control),
corr=abs(c(nnz.corr.neg, unname(unlist(nnz.corr.norm.neg))))
)
p.corr.frequency.neg.controls.all <- ggplot(neg.corr, aes(x=corr, colour=control, linetype=control)) +
geom_freqpoly(binwidth=.05, alpha=.9, size=.9) +
theme_classic() +
xlim(0, 1) +
scale_color_manual(values=brewer.pal(n=12, name="Paired")) + # First 2 colors from rcolorbrewer set "Dark2"
theme(legend.position=c(0.85,0.6),
legend.title=element_blank(),
legend.background=element_rect(fill=alpha('white', 0.5))) +
ylab("Frequency") +
xlab("Marginal correlation") + # ggtitle("Positive correlation among negative controls") +
theme(legend.key.width = unit(2,"cm"))
print(p.corr.frequency.neg.controls.all)
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_CorrDensityNeg_Freq_All.pdf"), p.corr.frequency.neg.controls.all, width = 8, height=5, device="pdf")
}
}
## Warning: Removed 22 row(s) containing missing values (geom_path).
if(settings$generate.paper.figs) {
## Generate correlation scatter plots for positive controls for TMM and UQ
par(mfrow=c(1,1))
# un-normalized vs RUVr
p.scatter.RUVr <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.RUVr$pos$corr,
clusters=clusters.data,
title="RUVr Normalization",
xlab="un-normalized",
ylab="RUVr",
threshold=0)
print(p.scatter.RUVr)
# un-normalized vs Med
p.scatter.Med <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.Med$pos$corr,
clusters=clusters.data,
title="Med Normalization",
xlab="un-normalized",
ylab="Med",
threshold=0)
print(p.scatter.Med)
# Save plots
if(settings$export.figures) {
# ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_ScatterRUVr.pdf"), p.scatter.RUVr, width=4, height=4, device="pdf")
# ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_ScatterMed.pdf"), p.scatter.Med, width=4, height=4, device="pdf")
}
}
if(settings$generate.paper.figs) {
par(mfrow=c(1,1))
# un-normalized vs TMM
p.scatter.TMM <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.TMM$pos$corr,
clusters=clusters, title="TMM", xlab="un-normalized", ylab="TMM",
ccc=round(DANA.results$metrics["TCGA.TMM", "cc"],3))
# un-normalized vs DESeq
p.scatter.DESeq <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.DESeq$pos$corr,
clusters=clusters, title="DESeq", xlab="un-normalized", ylab="DESeq",
ccc=round(DANA.results$metrics["TCGA.DESeq", "cc"],3))
# un-normalized vs PoissonSeq
p.scatter.PoissonSeq <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.PoissonSeq$pos$corr,
clusters=clusters, title="PoissonSeq", xlab="un-normalized", ylab="PoissonSeq",
ccc=round(DANA.results$metrics["TCGA.PoissonSeq", "cc"],3))
# un-normalized vs TC
p.scatter.TC <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.TC$pos$corr,
clusters=clusters, title="TC", xlab="un-normalized", ylab="TC",
ccc=round(DANA.results$metrics["TCGA.TC", "cc"],3))
# un-normalized vs Med
p.scatter.Med <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.Med$pos$corr,
clusters=clusters, title="Med", xlab="un-normalized", ylab="Med",
ccc=round(DANA.results$metrics["TCGA.Med", "cc"],3))
# un-normalized vs RUVg
p.scatter.RUVg <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.RUVg$pos$corr,
clusters=clusters, title="RUVg", xlab="un-normalized", ylab="RUVg",
ccc=round(DANA.results$metrics["TCGA.RUVg", "cc"],3))
# un-normalized vs RUVs
p.scatter.RUVs <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.RUVs$pos$corr,
clusters=clusters, title="RUVs", xlab="un-normalized", ylab="RUVs",
ccc=round(DANA.results$metrics["TCGA.RUVs", "cc"],3))
# un-normalized vs RUVr
p.scatter.RUVr <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.RUVr$pos$corr,
clusters=clusters, title="RUVr", xlab="un-normalized", ylab="RUVr",
ccc=round(DANA.results$metrics["TCGA.RUVr", "cc"],3))
# un-normalized vs UQ
p.scatter.UQ <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.UQ$pos$corr,
clusters=clusters, title="UQ", xlab="un-normalized", ylab="UQ",
ccc=round(DANA.results$metrics["TCGA.UQ", "cc"],3))
# un-normalized vs QN
p.scatter.QN <- plot.corr.scatter(
corr1=DANA.results$data.model$pos$corr,
corr2=DANA.results$norm.models$TCGA.QN$pos$corr,
clusters=clusters, title="QN", xlab="un-normalized", ylab="QN",
ccc=round(DANA.results$metrics["TCGA.QN", "cc"],3))
p.scatter.TCGA.all <- plot_grid(p.scatter.TMM,
p.scatter.DESeq,
p.scatter.PoissonSeq,
p.scatter.TC,
p.scatter.Med,
p.scatter.RUVg,
p.scatter.RUVs,
p.scatter.RUVr,
p.scatter.UQ,
p.scatter.QN,
ncol = 3,
align="hv")
plot(p.scatter.TCGA.all)
# Save plots
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_Scatter_All.pdf"), p.scatter.TCGA.all, width=9, height=12, device="pdf")
}
}
## Warning in plot.corr.scatter(corr1 = DANA.results$data.model$pos$corr, corr2 = DANA.results$norm.models$TCGA.RUVs$pos$corr, : Warning: Shapes of correlation matrices in plot.corr.scatter do not match!
## Reducing correlation matrices to all mutual genes.
if(settings$generate.paper.figs) {
options(scipen=4) # force non-scientific notation of x axis
test.DANA.metrics <- data.frame(
method=substr(rownames(DANA.results$metrics), 6, 20),
cc=DANA.results$metrics[, "cc"],
mcr=DANA.results$metrics[, "mcr"]
)
p.DANA.metrics <- ggplot(test.DANA.metrics,aes(x=mcr,y=cc, label=method)) +
geom_point(alpha=.5) +
theme_classic() +
xlab(TeX("$mcr^-$; Relative reduction of handling effects")) +
ylab(TeX("$cc^+$; Biological signal preservation")) +
geom_text_repel(aes(label = method), size=3, max.overlaps = Inf, min.segment.length = 0.2) +
# xlim(c(min(c(0, test.mposc.reduction)),max(test.mposc.reduction))) +
scale_x_continuous(labels = scales::percent, limits=c(0,1.05),
breaks = scales::pretty_breaks(n = 5)) +
# ylim(c(0,1))
scale_y_continuous(labels = scales::percent, limits = c(0,1))
print(p.DANA.metrics)
print("DANA Statistics for the TCGA-BRCA/UCS")
test.DANA.metrics$cc <- round(test.DANA.metrics$cc,3)
test.DANA.metrics$mcr <- round(test.DANA.metrics$mcr,3)
print(test.DANA.metrics)
if(settings$export.figures) {
ggsave(paste0(settings$paper.fig.path, "BRCA_UCS_DANAMetrics.pdf"), p.DANA.metrics, width=4.5, height=3.5, device="pdf")
}
}
## [1] "DANA Statistics for the TCGA-BRCA/UCS"
## method cc mcr
## 1 TMM 0.990 0.715
## 2 DESeq 0.990 0.733
## 3 PoissonSeq 0.988 0.717
## 4 TC 0.993 0.534
## 5 Med 0.924 0.363
## 6 RUVg 0.985 0.717
## 7 RUVs 0.829 0.812
## 8 RUVr 0.976 0.777
## 9 UQ 0.937 0.398
## 10 QN 0.972 0.783
sessionInfo()
## R version 4.0.2 (2020-06-22)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=German_Germany.1252 LC_CTYPE=German_Germany.1252
## [3] LC_MONETARY=German_Germany.1252 LC_NUMERIC=C
## [5] LC_TIME=German_Germany.1252
##
## attached base packages:
## [1] stats4 splines parallel stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] ggcorrplot_0.1.3 latex2exp_0.4.0
## [3] RColorBrewer_1.1-2 cowplot_1.1.0
## [5] openxlsx_4.2.2 ffpe_1.32.0
## [7] TTR_0.24.2 DescTools_0.99.38
## [9] vsn_3.56.0 RUVSeq_1.22.0
## [11] EDASeq_2.22.0 ShortRead_1.46.0
## [13] GenomicAlignments_1.24.0 SummarizedExperiment_1.18.2
## [15] DelayedArray_0.14.1 matrixStats_0.56.0
## [17] Rsamtools_2.4.0 GenomicRanges_1.40.0
## [19] GenomeInfoDb_1.24.2 Biostrings_2.56.0
## [21] XVector_0.28.0 IRanges_2.22.2
## [23] S4Vectors_0.26.1 sva_3.36.0
## [25] BiocParallel_1.22.0 genefilter_1.70.0
## [27] mgcv_1.8-33 nlme_3.1-149
## [29] PoissonSeq_1.1.2 combinat_0.0-8
## [31] DESeq_1.39.0 lattice_0.20-41
## [33] locfit_1.5-9.4 Biobase_2.48.0
## [35] BiocGenerics_0.34.0 edgeR_3.30.3
## [37] limma_3.44.3 FastGGM_1.0
## [39] Rcpp_1.0.5 huge_1.3.4.1
## [41] glmnet_4.1 Matrix_1.2-18
## [43] ggrepel_0.9.1 plotly_4.9.3
## [45] stargazer_5.2.2 corrplot_0.84
## [47] ggnewscale_0.4.5 gridExtra_2.3
## [49] ggplot2_3.3.3
##
## loaded via a namespace (and not attached):
## [1] R.utils_2.10.1 tidyselect_1.1.0
## [3] RSQLite_2.2.0 AnnotationDbi_1.50.3
## [5] htmlwidgets_1.5.3 grid_4.0.2
## [7] munsell_0.5.0 codetools_0.2-16
## [9] preprocessCore_1.50.0 nleqslv_3.3.2
## [11] withr_2.3.0 colorspace_1.4-1
## [13] knitr_1.30 rstudioapi_0.11
## [15] labeling_0.4.2 GenomeInfoDbData_1.2.3
## [17] hwriter_1.3.2 farver_2.0.3
## [19] bit64_4.0.5 rhdf5_2.32.4
## [21] vctrs_0.3.4 generics_0.0.2
## [23] xfun_0.17 BiocFileCache_1.12.1
## [25] R6_2.4.1 illuminaio_0.30.0
## [27] bitops_1.0-6 reshape_0.8.8
## [29] assertthat_0.2.1 scales_1.1.1
## [31] rootSolve_1.8.2.1 gtable_0.3.0
## [33] affy_1.66.0 methylumi_2.34.0
## [35] lmom_2.8 rlang_0.4.7
## [37] rtracklayer_1.48.0 lazyeval_0.2.2
## [39] GEOquery_2.56.0 BiocManager_1.30.10
## [41] yaml_2.2.1 crosstalk_1.1.0.1
## [43] GenomicFeatures_1.40.1 tools_4.0.2
## [45] nor1mix_1.3-0 affyio_1.58.0
## [47] ellipsis_0.3.1 lumi_2.40.0
## [49] siggenes_1.62.0 plyr_1.8.6
## [51] progress_1.2.2 zlibbioc_1.34.0
## [53] purrr_0.3.4 RCurl_1.98-1.2
## [55] prettyunits_1.1.1 openssl_1.4.3
## [57] bumphunter_1.30.0 zoo_1.8-8
## [59] sfsmisc_1.1-7 magrittr_1.5
## [61] data.table_1.13.2 mvtnorm_1.1-1
## [63] aroma.light_3.18.0 hms_0.5.3
## [65] evaluate_0.14 xtable_1.8-4
## [67] XML_3.99-0.5 jpeg_0.1-8.1
## [69] mclust_5.4.6 shape_1.4.5
## [71] compiler_4.0.2 biomaRt_2.44.4
## [73] minfi_1.34.0 tibble_3.0.3
## [75] KernSmooth_2.23-17 crayon_1.3.4
## [77] R.oo_1.24.0 htmltools_0.5.0
## [79] tidyr_1.1.2 geneplotter_1.66.0
## [81] expm_0.999-5 Exact_2.1
## [83] RcppParallel_5.0.2 DBI_1.1.0
## [85] dbplyr_1.4.4 MASS_7.3-53
## [87] rappdirs_0.3.1 boot_1.3-25
## [89] readr_1.4.0 quadprog_1.5-8
## [91] R.methodsS3_1.8.1 igraph_1.2.6
## [93] pkgconfig_2.0.3 xml2_1.3.2
## [95] foreach_1.5.1 annotate_1.66.0
## [97] rngtools_1.5 multtest_2.44.0
## [99] beanplot_1.2 doRNG_1.8.2
## [101] scrime_1.3.5 stringr_1.4.0
## [103] digest_0.6.25 rmarkdown_2.4
## [105] base64_2.0 gld_2.6.2
## [107] DelayedMatrixStats_1.10.1 curl_4.3
## [109] lifecycle_0.2.0 jsonlite_1.7.1
## [111] Rhdf5lib_1.10.1 viridisLite_0.3.0
## [113] askpass_1.1 pillar_1.4.6
## [115] httr_1.4.2 survival_3.2-3
## [117] glue_1.4.2 xts_0.12.1
## [119] zip_2.1.1 png_0.1-7
## [121] iterators_1.0.13 bit_4.0.4
## [123] class_7.3-17 stringi_1.5.3
## [125] HDF5Array_1.16.1 blob_1.2.1
## [127] latticeExtra_0.6-29 memoise_1.1.0
## [129] dplyr_1.0.2 e1071_1.7-4